一个用于网站页面输出HTML压缩的算法。
using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;
using System.IO;
using System.Text.RegularExpressions;
using System.Threading;
using System.Diagnostics;

namespace Inno.Infrastructure.Utilities
{
    /// <summary>
    /// A fast html compressor that removes unnecessary whitespace and comments and
    /// compresses inline scripts and css styles found in an html stream.
    /// </summary>
    /// <remarks>
    /// The compressor is a character-at-a-time state machine: <see cref="compress()"/> reads
    /// the input one character at a time and decides what to emit from the flags kept in
    /// the private fields below. Script and style bodies are buffered and handed to
    /// <c>MyMin.parse</c> once their end tag is seen.
    /// </remarks>
    public class HtmlCompressor
    {
        private StreamReader _input;
        private StreamWriter _output;
        private Stream _outputStream;
        private long _originalContentLength = 0;
        private long _compressedContentLength = 0;
#if DEBUG
        private StringBuilder _debugOutput = new StringBuilder();
#endif
        private char _tok;
        private const int EOF = -1;

        // reusable one-character buffer so byte counting/encoding does not allocate per character.
        private readonly char[] _oneChar = new char[1];

        /// <summary>
        /// Create a default html compressor without input and output streams.
        /// Both must be supplied before <see cref="compress()"/> can be called.
        /// </summary>
        public HtmlCompressor()
        {
        }

        /// <summary>
        /// Create a compressor with the specified input reader and output writer.
        /// </summary>
        /// <param name="input">Reader the html is read from.</param>
        /// <param name="output">Writer the compressed html is written to.</param>
        public HtmlCompressor(StreamReader input, StreamWriter output)
        {
            this._input = input;
            this._output = output;
        }

        /// <summary>
        /// Create a compressor with the specified output writer.
        /// </summary>
        /// <param name="output">Writer the compressed html is written to.</param>
        public HtmlCompressor(StreamWriter output)
        {
            this._output = output;
        }

        /// <summary>
        /// Create a compressor that writes encoded bytes to the specified stream.
        /// </summary>
        /// <param name="outputStream">Stream the compressed html is written to.</param>
        public HtmlCompressor(Stream outputStream)
        {
            this._outputStream = outputStream;
        }

        /// <summary>
        /// Gets or sets the encoding the compressor uses when it has to convert between
        /// characters and bytes. <c>Encoding.Default</c> is used while this is null.
        /// </summary>
        public Encoding Encoding { get; set; }

        /// <summary>
        /// Gets or sets whether input/output byte counts are tracked. In DEBUG builds
        /// <see cref="flush"/> appends a compress-ratio footer when this is true.
        /// </summary>
        public bool enable_compress_ratio { get; set; }

        // tag types.
        enum TagTypes
        {
            NONE,
            CDADA,
            DOCTYPE,
            TEXTAREA,
            PRE,
            STYLE,
            SCRIPT,
            OTHERS
        }

        #region state machine states

        private TagTypes _tagType = TagTypes.NONE;

        // states
        private bool _startlt;
        private long _startltPos;
        private bool _startTagName;
        private bool _startTag;
        private bool _startAttr;
        private bool _startAttrName;
        private bool _endAttrName;
        private bool _startEq;
        private long _startEqPos = -1;
        private bool _startAttrValue;
        private bool _startAttrValueNoQuotes;
        private char _attrQuoteChar;
        private bool _lookForEndTag;
        private bool _startWsEndTag;
        private StringBuilder _endTagNameBuf = new StringBuilder();
        private string _endTagName;
        private bool _blockEndTag;
        private bool _lookAheadEndOfTagGt;
        private bool _startEscape;
        private long _lastEscapePos;
        private int _escapeCount;
        private bool _startPI; // start processing instruction <!
        private bool _startComment;
        private bool _startCDATA;
        private int _cdataCloseBracketCount = 0;
        private long _cdataCloseBracketPos = -1;
        private int _endOfCommentHyphenCount;
        private long _lastHyphenPos;
        private StringBuilder _cdataTagName = new StringBuilder();

        // tag name buffer.
        private StringBuilder _tagNameBuf = new StringBuilder(256);
        private string _tagName;

        // text between tags.
        private bool _lookForNextTag = true;
        private StringBuilder _textNodeContent = new StringBuilder();
        private bool _startWsContent;
        private bool _delayWsContent; // if the content should be delayed.
        private StringBuilder _wsContent = new StringBuilder();
        private long _endOfTagLtPos;

        // simple state machine to bypass javascript/css comments and strings.
        private bool _startJsComment;
        private bool _isJsBlockComment;
        private bool _startJsString;
        private char _jsStringQuoteChar;
        private bool _lookAheadStartJsComment;
        private bool _lookAheadEndOfJsBlockComment;
        private bool _endOfTagSeen;
        private long _endOfJsBlockCommentLookAheadPos;

        #endregion

        #region helper methods

        // number of read attempts so far; pos() derives the current character index from it.
        private long _pointer = 0;

        // the encoding used for char <-> byte conversion when none was configured.
        private Encoding current_encoding()
        {
            return this.Encoding ?? Encoding.Default;
        }

        // read next token from the stream and put the token in the _tok field.
        // returns false once the reader reports end of input.
        private bool next()
        {
            _pointer++;
            var b = _input.Read();
            if (b == EOF)
            {
                return false;
            }

            _tok = (char)b;
            if (this.enable_compress_ratio)
            {
                _oneChar[0] = _tok;
                _originalContentLength += current_encoding().GetByteCount(_oneChar);
            }
            return true;
        }

        // advance until a non-whitespace token is current or the input is exhausted.
        private void skip()
        {
            while (next() && char.IsWhiteSpace(_tok))
            {
            }
        }

        enum TokenState
        {
            StartTag,
        }

        // emit the current token.
        private HtmlCompressor echo()
        {
            return echo(_tok);
        }

        // emit one character: into the delay buffer while a script/style body is being
        // collected, otherwise straight to the configured writer or stream.
        private HtmlCompressor echo(char ch)
        {
            if (_delayWsContent)
            {
                _wsContent.Append(ch);
            }
            else
            {
                _oneChar[0] = ch;
                if (null != _output)
                {
                    _output.Write(ch);
                    if (this.enable_compress_ratio)
                    {
                        // count writer output too, otherwise the ratio always reads 100%.
                        _compressedContentLength += _output.Encoding.GetByteCount(_oneChar);
                    }
                }
                else if (null != _outputStream)
                {
                    var bytes = current_encoding().GetBytes(_oneChar);
                    _outputStream.Write(bytes, 0, bytes.Length);
                    if (this.enable_compress_ratio)
                    {
                        _compressedContentLength += bytes.Length;
                    }
                }
#if DEBUG
                _debugOutput.Append(ch);
#endif
            }
            return this;
        }

        // emit a string; same routing rules as echo(char).
        private HtmlCompressor echo(string s)
        {
            if (_delayWsContent)
            {
                _wsContent.Append(s);
            }
            else
            {
                if (null != _output)
                {
                    _output.Write(s);
                    if (this.enable_compress_ratio && null != s)
                    {
                        _compressedContentLength += _output.Encoding.GetByteCount(s);
                    }
                }
                else if (null != _outputStream)
                {
                    var bytes = current_encoding().GetBytes(s);
                    _outputStream.Write(bytes, 0, bytes.Length);
                    if (this.enable_compress_ratio)
                    {
                        _compressedContentLength += bytes.Length;
                    }
                }
#if DEBUG
                _debugOutput.Append(s);
#endif
            }
            return this;
        }

        // zero-based index of the current token within everything read so far.
        private long pos()
        {
            return _pointer - 1;
        }

        private bool isws()
        {
            return char.IsWhiteSpace(_tok);
        }

        private bool iscrlf()
        {
            return _tok == '\r' || _tok == '\n';
        }

        // true while a script/style body is being buffered.
        private bool in_script_or_css()
        {
            return _startWsContent && _delayWsContent;
        }

        private static readonly Regex _rExtraSpaces = new Regex("\\s{2,}", RegexOptions.Compiled | RegexOptions.IgnoreCase);

        private void echo_text_node_content()
        {
            echo_text_node_content(false);
        }

        // emit the text collected between two tags with runs of whitespace collapsed to a
        // single space, then clear the buffer. When trim is true the text is trimmed on both
        // sides (one leading space is kept); otherwise only CR/LF/TAB are trimmed.
        private void echo_text_node_content(bool trim)
        {
            bool hasContent = false;
            for (var i = 0; i < _textNodeContent.Length; i++)
            {
                if (!char.IsWhiteSpace(_textNodeContent[i]))
                {
                    hasContent = true;
                    break;
                }
            }

            if (hasContent)
            {
                var textNodeContent = _textNodeContent.ToString();
                textNodeContent = _rExtraSpaces.Replace(textNodeContent, " ");
                if (trim)
                {
                    bool reserveLeadingSpace = textNodeContent.Length > 0 && textNodeContent[0] == ' ';
                    if (reserveLeadingSpace)
                    {
                        textNodeContent = ' ' + textNodeContent.Trim();
                    }
                    else
                    {
                        textNodeContent = textNodeContent.Trim();
                    }
                }
                else
                {
                    // only trim new lines and tabs.
                    textNodeContent = textNodeContent.Trim('\r', '\n', '\t');
                }
                echo(textNodeContent);
            }
            else
            {
                // whitespace-only text: keep a single space if the text started with one.
                if (_textNodeContent.Length > 0 && _textNodeContent[0] == ' ')
                {
                    echo(' ');
                }
            }

            _lookForNextTag = false;
            _textNodeContent.Remove(0, _textNodeContent.Length);
        }

        // move the buffered tag name into _tagName (upper-cased) and classify it.
        // Invariant casing is required: with a culture-sensitive ToUpper() "script"
        // becomes "SCRİPT" under a Turkish culture and would never match.
        private void capture_tag_name()
        {
            _tagName = _tagNameBuf.ToString().ToUpperInvariant();
            _tagNameBuf.Remove(0, _tagNameBuf.Length);

            // determine the tag type, we should handle special
            // tag names, such as !DOCTYPE, textarea, pre, style, script.
            switch (_tagName)
            {
                case "!DOCTYPE":
                    _tagType = TagTypes.DOCTYPE;
                    break;
                case "TEXTAREA":
                    _tagType = TagTypes.TEXTAREA;
                    break;
                case "PRE":
                    _tagType = TagTypes.PRE;
                    break;
                case "STYLE":
                    _tagType = TagTypes.STYLE;
                    break;
                case "SCRIPT":
                    _tagType = TagTypes.SCRIPT;
                    break;
                default:
                    _tagType = TagTypes.OTHERS;
                    break;
            }
        }

        // compress the specified scripts.
        private string compress_scripts(string scripts)
        {
            try
            {
                if (string.IsNullOrEmpty(scripts))
                {
                    return string.Empty;
                }
                return MyMin.parse(scripts);
            }
            catch (Exception)
            {
                // deliberate best effort: if the minifier fails, emit the script unchanged.
            }
            return scripts;
        }

        /// <summary>
        /// Compress the specified styles with <c>MyMin.parse</c>.
        /// </summary>
        /// <param name="styles">The css text to compress.</param>
        /// <returns>
        /// The compressed css, an empty string for null/empty input, or the original
        /// text when the minifier throws.
        /// </returns>
        public string compress_styles(string styles)
        {
            try
            {
                if (string.IsNullOrEmpty(styles))
                {
                    return string.Empty;
                }
                return MyMin.parse(styles, true, true);
            }
            catch (Exception)
            {
                // deliberate best effort: if the minifier fails, emit the styles unchanged.
            }
            return styles;
        }

        #endregion

        /// <summary>
        /// Supply the next input reader so that multiple segments of html content can be
        /// compressed serially. The parser state is kept in instance fields between calls
        /// to <see cref="compress()"/>, so a segment does not have to be a complete document.
        /// </summary>
        /// <param name="input">Reader for the next segment.</param>
        public void fill(StreamReader input)
        {
            this._input = input;
        }

        /// <summary>
        /// Fill with an array of bytes, decoded with <see cref="Encoding"/>
        /// (or <c>Encoding.Default</c> when none is set).
        /// </summary>
        /// <param name="input">Buffer holding the encoded html.</param>
        /// <param name="offset">Index of the first byte to use.</param>
        /// <param name="count">Number of bytes to use.</param>
        public void fill(byte[] input, int offset, int count)
        {
            MemoryStream stream = new MemoryStream(input, offset, count);
            StreamReader reader = new StreamReader(stream, current_encoding());
            fill(reader);
        }

        /// <summary>
        /// Compress the given html string.
        /// </summary>
        /// <param name="htmlString">The html text to compress.</param>
        public void compress(string htmlString)
        {
            var bytes = Encoding.UTF8.GetBytes(htmlString);
            var memoryStream = new MemoryStream(bytes);
            this._input = new StreamReader(memoryStream);

            // start compress.
            compress();
        }

        /// <summary>
        /// Close the compressing process: emits any text that is still buffered.
        /// </summary>
        public void flush()
        {
            // collect pending html texts.
            if (_lookForNextTag)
            {
                echo_text_node_content();
            }

            // a script/style body whose end tag never arrived is still sitting in the delay
            // buffer; emit it verbatim instead of silently dropping it.
            if (_delayWsContent && _wsContent.Length > 0)
            {
                string pending = _wsContent.ToString();
                _wsContent.Remove(0, _wsContent.Length);
                _delayWsContent = false;
                echo(pending);
            }
#if DEBUG
            if (this.enable_compress_ratio && _originalContentLength > 0)
            {
                string compress_ratio = string.Format(
                    "<div class='html-compressor-compress-ratio'><i style='font-size:.7em;'>---- compress ratio: {0:F2}%. ----</i></div>",
                    100 * ((_originalContentLength - _compressedContentLength) * 1.0 / _originalContentLength)
                );

                // the footer is written directly (not through echo) so it is not counted.
                if (null != _output)
                {
                    _output.Write(compress_ratio);
                }
                else if (null != _outputStream)
                {
                    var bytes = current_encoding().GetBytes(compress_ratio);
                    _outputStream.Write(bytes, 0, bytes.Length);
                }
            }
#endif
        }

        /// <summary>
        /// NOTE(review): nothing in this class assigns this property; confirm whether it is still needed.
        /// </summary>
        public bool isEndOfDocument { get; private set; }

        /// <summary>
        /// Compress the html available from the current input, which is supplied through a
        /// constructor or <c>fill</c>. To compress a string, use compress(string htmlString).
        /// </summary>
        /// <exception cref="InvalidOperationException">
        /// No input reader, or neither an output writer nor an output stream, has been set.
        /// </exception>
        public void compress()
        {
            if (null == this._input || (null == this._output && null == this._outputStream))
            {
                throw new InvalidOperationException("Input and output stream must both be specified. ");
            }

            while (next())
            {
                // dispatch on the current character; "continue" moves on to the next one.
                switch (_tok)
                {
                    /* start tag quote */
                    case '<':
                        {
                            if (_lookAheadEndOfTagGt)
                            {
                                echo('/');
                                _lookAheadEndOfTagGt = false;
                            }

                            if (_startComment)
                            {
                                continue;
                            }
                            else if (_startAttrValue || _startCDATA)
                            {
                                // output directly in attribute values and CDATA sections.
                                echo();
                            }
                            else if (_startWsContent)
                            {
                                // because we can't determine if the token comes
                                // from inside the script/style or indicates an
                                // end of block tag, we have to echo it anyway.
                                echo();
                                if (!_startJsString && !_startJsComment)
                                {
                                    _endOfTagLtPos = pos();
                                }
                            }
                            else
                            {
                                _startlt = true;
                                _startltPos = pos();
                            }
                        }
                        break;

                    /* probably comment sign, i.e., <!-- */
                    case '!':
                        {
                            if (_startlt)
                            {
                                if (_startltPos + 1 == pos())
                                {
                                    // start of processing instructions, e.g.,
                                    // <!--, <![CDATA, <!DOCTYPE, etc.
                                    _startPI = true;
                                }
                                else
                                {
                                    echo();
                                }
                                _startlt = false;
                                _startltPos = -1;
                            }
                            else
                            {
                                // otherwise, treat it as a generic character.
                                generic_handler();
                            }
                        }
                        break;

                    /* probably comment sign, i.e., <!-- */
                    case '-':
                        {
                            if (_startPI)
                            {
                                _startComment = true;
                                _startPI = false;

                                // reset the hyphen testing flags.
                                _endOfCommentHyphenCount = 0;
                                _lastHyphenPos = -1;
                            }
                            else if (_startComment)
                            {
                                // a gap since the previous hyphen restarts the run.
                                if (_lastHyphenPos + 1 != pos())
                                {
                                    _endOfCommentHyphenCount = 0;
                                    _lastHyphenPos = -1;
                                }

                                // record for test of end of comment.
                                _endOfCommentHyphenCount++;
                                _lastHyphenPos = pos();
                            }
                            else
                            {
                                // call generic handler.
                                generic_handler();
                            }
                        }
                        break;

                    /* probably start of CDATA <![ */
                    case '[':
                        {
                            if (_startPI)
                            {
                                if (_lookForNextTag)
                                {
                                    echo_text_node_content();
                                }
                                _startCDATA = true;
                                _startPI = false;
                                echo('<').echo('!').echo();

                                // clear the cdata tag name buffer.
                                _cdataTagName.Remove(0, _cdataTagName.Length);
                            }
                            else if (_startCDATA)
                            {
                                echo();
                            }
                            else
                            {
                                // call generic handler.
                                generic_handler();
                            }
                        }
                        break;

                    /* probably end of CDATA section */
                    case ']':
                        {
                            if (_startCDATA)
                            {
                                echo();
                                if (_cdataCloseBracketPos + 1 != pos())
                                {
                                    _cdataCloseBracketPos = -1;
                                    _cdataCloseBracketCount = 0;
                                }
                                _cdataCloseBracketCount++;
                                _cdataCloseBracketPos = pos();
                            }
                            else
                            {
                                // call generic handler.
                                generic_handler();
                            }
                        }
                        break;

                    /* probably end of tag such as </a>, <br />, two forms */
                    case '/':
                        {
                            if (_lookAheadEndOfTagGt)
                            {
                                echo('/');
                                _lookAheadEndOfTagGt = false;
                            }

                            if (_startAttrValueNoQuotes)
                            {
                                echo();
                                continue;
                            }

                            if (_startWsContent)
                            {
                                // skip token in the scripts/css.
                                if (_startJsString)
                                {
                                    echo();
                                }
                                else if (_startJsComment)
                                {
                                    echo();
                                    if (_lookAheadEndOfJsBlockComment)
                                    {
                                        // "*/" closes the block comment.
                                        if (_endOfJsBlockCommentLookAheadPos + 1 == pos())
                                        {
                                            _lookAheadEndOfJsBlockComment = false;
                                            _startJsComment = false;
                                            _isJsBlockComment = false;
                                        }
                                    }
                                }
                                else if (_lookAheadStartJsComment)
                                {
                                    echo();

                                    // we found the inline comment.
                                    _lookAheadStartJsComment = false;
                                    _startJsComment = true;
                                    _isJsBlockComment = false;
                                }
                                else
                                {
                                    // we need read on to determine what to do.
                                    echo();
                                    _lookAheadStartJsComment = true;

                                    // this flag sets if we have seen the probable end of tag "</".
                                    _endOfTagSeen = _endOfTagLtPos + 1 == pos();
                                }
                            }
                            else if (_startAttr && !_startAttrValue || _startTagName)
                            {
                                _blockEndTag = false;
                                _startAttr = false;
                                _startTagName = false;
                                _startEq = false;
                                _startEqPos = -1;
                                if (_startTag || _startTagName)
                                {
                                    echo(' ');
                                }
                                echo();
                            }
                            else if (_startTag)
                            {
                                // might be end of tag sign '<img />', need look ahead.
                                _lookAheadEndOfTagGt = true;
                            }
                            else if (_startlt)
                            {
                                if (_lookForNextTag)
                                {
                                    echo_text_node_content(true);
                                }
                                echo('<').echo();
                                _startlt = false;
                                _lookForEndTag = true;
                            }
                            else
                            {
                                // call generic handler.
                                generic_handler();
                            }
                        }
                        break;

                    /* possible start or end of js block comment */
                    case '*':
                        {
                            if (!_startWsContent)
                            {
                                generic_handler();
                                continue;
                            }

                            if (_lookAheadStartJsComment)
                            {
                                // we detected a block js comment here.
                                echo();
                                _startJsComment = true;
                                _isJsBlockComment = true;
                                _lookAheadStartJsComment = false;
                            }
                            else if (_startJsComment && _isJsBlockComment)
                            {
                                echo();
                                if (!_lookAheadEndOfJsBlockComment)
                                {
                                    _lookAheadEndOfJsBlockComment = true;
                                }
                                _endOfJsBlockCommentLookAheadPos = pos();
                            }
                            else
                            {
                                // call generic handler.
                                generic_handler();
                            }
                        }
                        break;

                    /* end tag quote */
                    case '>':
                        {
                            if (_startComment)
                            {
                                if (_lastHyphenPos + 1 == pos() && _endOfCommentHyphenCount >= 2)
                                {
                                    // end comment.
                                    _startComment = false;
                                    continue;
                                }
                            }
                            else if (_startTag && !_startAttrValue || _startTagName || _startAttrValueNoQuotes || _lookAheadEndOfTagGt)
                            {
                                // should be <tag>
                                if (_startTagName)
                                {
                                    // remember the tag name and clear the tag name buffer.
                                    capture_tag_name();
                                }

                                // must be a tag close character, reset tag related states.
                                _startTag = false;
                                _startTagName = false;
                                _startAttr = false;
                                _startAttrName = false;
                                _startAttrValue = false;
                                _startEq = false;
                                _startAttrValueNoQuotes = false;
                                _endOfTagLtPos = -1;

                                if (_lookAheadEndOfTagGt)
                                {
                                    _lookAheadEndOfTagGt = false;
                                    echo(' ').echo('/').echo();
                                    _lookForNextTag = true;
                                }
                                else
                                {
                                    echo();

                                    // the text inside the following tags should be preserved, and
                                    // CSS and scripts code should be compressed as well.
                                    if (_tagType == TagTypes.PRE || _tagType == TagTypes.SCRIPT
                                        || _tagType == TagTypes.STYLE || _tagType == TagTypes.TEXTAREA)
                                    {
                                        _startWsContent = true;
                                        _wsContent.Remove(0, _wsContent.Length);
                                        _delayWsContent = (_tagType == TagTypes.SCRIPT || _tagType == TagTypes.STYLE);
                                        _tagNameBuf.Remove(0, _tagNameBuf.Length);
                                    }
                                    else
                                    {
                                        _tagNameBuf.Remove(0, _tagNameBuf.Length);
                                        _lookForNextTag = true;
                                    }
                                }
                                continue;
                            }
                            else if (_lookForEndTag)
                            {
                                echo();
                                _lookForEndTag = false;
                                _lookForNextTag = true;
                                continue;
                            }
                            else if (_startWsEndTag)
                            {
                                echo();

                                // close an "end-of-tag" tag, we test the tag name here.
                                // if it is css or script, we get a chance to compress them.
                                if (_blockEndTag)
                                {
                                    string endTagName = _endTagNameBuf.ToString();

                                    // test if end of whitespace tag.
                                    bool isScript = string.Equals(endTagName, "script", StringComparison.OrdinalIgnoreCase);
                                    bool isStyle = string.Equals(endTagName, "style", StringComparison.OrdinalIgnoreCase);
                                    bool isTextArea = _tagType == TagTypes.TEXTAREA
                                        && string.Equals(endTagName, "textarea", StringComparison.OrdinalIgnoreCase);
                                    bool isPre = _tagType == TagTypes.PRE
                                        && string.Equals(endTagName, "pre", StringComparison.OrdinalIgnoreCase);

                                    if (!(isTextArea || isPre || isScript || isStyle))
                                    {
                                        // some other end tag inside the preserved block: keep going.
                                        _startWsEndTag = false;
                                        _blockEndTag = false;
                                        _endTagNameBuf.Remove(0, _endTagNameBuf.Length);
                                        continue;
                                    }

                                    // only a buffered (script/style) body can be compressed; when nothing
                                    // was delayed the content and its end tag were already written out.
                                    if ((isScript || isStyle) && _delayWsContent)
                                    {
                                        string wsContent = _wsContent.ToString();

                                        // the buffer ends with the end tag that was just closed. Cut at its
                                        // "</" rather than searching for the exact "</name>" text, which is
                                        // absent for forms such as "</script >" and made Substring throw.
                                        int index = wsContent.LastIndexOf("</", StringComparison.Ordinal);
                                        if (index < 0)
                                        {
                                            index = wsContent.Length;
                                        }
                                        wsContent = wsContent.Substring(0, index).Trim();

                                        // do compression.
                                        if (isScript)
                                        {
                                            wsContent = compress_scripts(wsContent);
                                        }
                                        else
                                        {
                                            wsContent = compress_styles(wsContent);
                                        }

                                        _delayWsContent = false;
                                        if (wsContent.Length > 0)
                                        {
                                            echo("/*<![CDATA[*/"); // make it XHTML compatible
                                            echo(wsContent);
                                            echo("/*]]>*/");
                                        }
                                        echo("</" + endTagName + ">");
                                    }

                                    _wsContent.Remove(0, _wsContent.Length);
                                    _endTagNameBuf.Remove(0, _endTagNameBuf.Length);
                                    _endTagName = string.Empty;
                                }

                                _startWsContent = false;
                                _blockEndTag = false;
                                _startWsEndTag = false;
                                _tagType = TagTypes.NONE; // reset the preserve whitespace tag type.
                                continue;
                            }
                            else if (_startlt)
                            {
                                _startlt = false;
                                _startltPos = -1;
                                echo();
                                continue;
                            }
                            else if (_startCDATA)
                            {
                                echo();
                                if (_cdataCloseBracketCount >= 1)
                                {
                                    // end of a CDATA section, reset the CDATA states.
                                    _startCDATA = false;
                                }
                                continue;
                            }

                            // call generic handler.
                            generic_handler();
                        }
                        break;

                    /* eq sign for attributes */
                    case '=':
                        {
                            if (_startAttrName || _endAttrName)
                            {
                                echo();
                                _startAttrName = false;
                                _endAttrName = false;
                                _startEq = true;
                                _startEqPos = pos();
                                continue;
                            }

                            // call generic handler.
                            generic_handler();
                        }
                        break;

                    /* quotes */
                    case '\'':
                    case '"':
                        {
                            if (_lookAheadEndOfTagGt)
                            {
                                echo('/');
                                _lookAheadEndOfTagGt = false;
                            }

                            if (_startTag)
                            {
                                if (_startAttrValue)
                                {
                                    // end of attribute value.
                                    if (_tok == _attrQuoteChar)
                                    {
                                        echo();
                                        _startAttrValue = false;
                                        _startAttr = false;
                                    }
                                    else
                                    {
                                        echo();
                                    }
                                }
                                else
                                {
                                    // a special attribute w/out attribute name.
                                    if (!_startEq)
                                    {
                                        echo(' ');
                                        _startAttr = true;
                                    }
                                    _startAttrName = false;
                                    _endAttrName = false;
                                    _startEq = false;
                                    _startEqPos = -1;
                                    _startAttrValue = true;
                                    _attrQuoteChar = _tok;
                                    echo();
                                }
                            }
                            else if (in_script_or_css())
                            {
                                echo();
                                if (_startJsString)
                                {
                                    // a backslash run only counts if it immediately precedes this quote.
                                    if (_startEscape && pos() != _lastEscapePos + 1)
                                    {
                                        _startEscape = false;
                                    }
                                    if (_tok == _jsStringQuoteChar)
                                    {
                                        // determine if this the end of the string.
                                        if (!_startEscape || _escapeCount % 2 == 0)
                                        {
                                            _startJsString = false;
                                        }
                                    }
                                }
                                else
                                {
                                    // NOTE(review): a quote inside a js/css comment also lands here and
                                    // starts a "string"; _startJsComment is not consulted. Confirm intent.
                                    _startJsString = true;
                                    _jsStringQuoteChar = _tok;
                                }
                            }
                            else
                            {
                                // call generic handler.
                                generic_handler();
                            }
                        }
                        break;

                    /* escape character in quotes */
                    case '\\':
                        if (_startJsString)
                        {
                            if (_startEscape && pos() != _lastEscapePos + 1)
                            {
                                _startEscape = false;
                            }
                            if (!_startEscape)
                            {
                                _startEscape = true;
                                _escapeCount = 1;
                            }
                            else
                            {
                                _escapeCount++;
                            }
                            _lastEscapePos = pos();
                            echo();
                        }
                        else
                        {
                            // call generic handler.
                            generic_handler();
                        }
                        break;

                    /* whitespace */
                    case ' ':
                    case '\t':
                    case '\r':
                    case '\n':
                        {
                            if (_startlt)
                            {
                                continue; // skip.
                            }
                            else if (_startTagName)
                            {
                                _startTagName = false;
                                _startTag = true; // tag name found, start tag.

                                // remember the tag name and clear the tag name buffer.
                                capture_tag_name();

                                // reset the quote char.
                                _attrQuoteChar = ' ';
                            }
                            else if (_startTag) // we're inside a tag.
                            {
                                if (!_startAttr)
                                {
                                    continue; // skip.
                                }
                                else // the attribute has started.
                                {
                                    // see if we're inside an attribute name.
                                    if (_startAttrName)
                                    {
                                        _startAttrName = false;
                                        _endAttrName = true;
                                    }
                                    else if (_startAttrValue)
                                    {
                                        // skip '\r' and '\n' in attribute values.
                                        if (iscrlf())
                                        {
                                            continue;
                                        }

                                        // preserve non-crlf whitespace.
                                        echo();
                                    }
                                    else if (_startAttrValueNoQuotes)
                                    {
                                        _startAttrValueNoQuotes = false;
                                        _startAttr = false;
                                    }
                                }
                            }
                            else if (_startCDATA)
                            {
                                echo();
                            }
                            else if (_startWsContent)
                            {
                                echo();
                                if (_lookAheadStartJsComment)
                                {
                                    _lookAheadStartJsComment = false;
                                }
                                else if (_startJsComment && !_isJsBlockComment && iscrlf())
                                {
                                    // a line break ends a "//" comment.
                                    _startJsComment = false;
                                    _isJsBlockComment = false;
                                }
                            }
                            else if (_lookForNextTag) // otherwise, we collect the text to a temporary buffer.
                            {
                                _textNodeContent.Append(_tok);
                            }
                        }
                        break;

                    default:
                        {
                            generic_handler();
                        }
                        break;
                }
            }
        }

        // handle generic token: any character that has no special meaning in the current state.
        private void generic_handler()
        {
            if (_lookAheadEndOfTagGt)
            {
                echo('/');
                _lookAheadEndOfTagGt = false;
            }

            if (_startlt)
            {
                _startlt = false;
                _startltPos = -1;
                _startTagName = true; // look for tag name.

                // append pending text node text.
                if (_lookForNextTag)
                {
                    echo_text_node_content();
                }

                _tagNameBuf.Remove(0, _tagNameBuf.Length);
                echo('<');
                _tagNameBuf.Append(_tok);
                echo();
            }
            else if (_startTagName)
            {
                _tagNameBuf.Append(_tok);
                echo();
            }
            else if (_startTag)
            {
                if (!_startAttr)
                {
                    echo(' '); // add a space between attributes (or attribute and tag)
                    _startAttr = true;
                    echo();
                    _startAttrName = true;
                }
                else // inside an attribute.
                {
                    // in this case, this is an attribute w/out value, such <input checked... >.
                    if (_startAttrName)
                    {
                        echo();
                    }
                    else if (_endAttrName)
                    {
                        echo(' '); // add a space between attributes (or attribute and tag)
                        _startAttr = true;
                        echo();
                        _startAttrName = true;
                        _endAttrName = false;
                    }
                    else if (_startAttrValue)
                    {
                        echo();
                    }
                    else if (_startEq && (_startEqPos + 1) == pos())
                    {
                        // this is an attribute value without quotes.
                        _startAttrValueNoQuotes = true;
                        _startEq = false;
                        _startEqPos = -1;
                        echo();
                    }
                    else if (_startAttrValueNoQuotes)
                    {
                        echo();
                    }
                }
            }
            else if (_startCDATA)
            {
                echo();
            }
            else if (_startPI)
            {
                _startPI = false;
                _startlt = false;
                _startltPos = -1;
                _startTagName = true; // look for tag name.

                if (_lookForNextTag)
                {
                    echo_text_node_content();
                }

                _tagNameBuf.Append('!').Append(_tok);
                echo('<').echo('!').echo();
            }
            else if (_startWsContent)
            {
                echo();
                if (_lookAheadStartJsComment)
                {
                    _lookAheadStartJsComment = false;
                    if (_endOfTagSeen)
                    {
                        _endOfTagSeen = false;
                        _startWsEndTag = true;
                        _blockEndTag = true;

                        // begin collect the end tag name.
                        _endTagNameBuf.Remove(0, _endTagNameBuf.Length);
                        _endTagName = string.Empty;
                    }
                }

                if (_startWsEndTag)
                {
                    _endTagNameBuf.Append(_tok);
                }
            }
            else if (_startComment)
            {
                /* noop; comment content is dropped. */
            }
            else if (_lookForNextTag)
            {
                _textNodeContent.Append(_tok);
            }
            else if (_lookForEndTag)
            {
                echo();
            }
            else
            {
                echo();
            }
        }
    }
}
评论